library(dplyr)
library(tidyverse)
library(readxl)

# setwd("D:PhD/01_Data/03_Vitro/04_Colonization difference between WT and KO") # Windows
# setwd("/Volumes/Yiming_Wang/PhD/01_Data/03_Vitro/04_Colonization difference between WT and KO") # Mac
setwd("/Users/godric/Desktop/Fut2 review comments/Revision") #Macbook pro

df.all <- read_excel("Data for AUC.xlsx")
# First get rid of negative data
# Then gather data to have long table 
# Then separate type to get hr and type 
df.1<-df.all%>%
  filter(Genotype!="Negative")%>%
  gather(type,value,-c(No,ID,Genotype,Sex,GenotypeSex,Cage))%>% # convert the original table to long table, create two new columns and put them after -c()
  separate(type,c("hr","Treatment"),remove=T) # Seprate type into two columns, e.g. "24hr_NG", separate function can separate the item which has non-alphabet/number symbol into two parts

df.2 <- df.1 %>% 
  arrange(ID,hr) %>% 
  group_by(ID,hr) %>% 
  mutate(OD_difference = value - lag(value)) %>% 
  filter(!is.na(OD_difference)) %>% 
  select(ID, Genotype, Sex, GenotypeSex, hr, OD_difference)

summary <- df.2 %>%
  group_by(Genotype,hr) %>%
  summarise(value.mean=mean(OD_difference), sd=sd(OD_difference))%>%
  ungroup()

# min line is for ribbon in figure
min.line<-summary%>%
  select(-sd)%>%
  mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr)))) %>%
  spread(Genotype,value.mean)%>%
  group_by(hr)%>%
  mutate(min_line=min(WT,KO))%>%
  select(hr,min_line)

summary.final<-summary %>%
  mutate(hr.new = as.numeric(gsub("hr","",gsub("X","",hr))))%>% 
  left_join(min.line,by=c("hr.new"="hr"))


# Gather negative control data
neg.ctrl<-df.all %>%
  filter(Genotype == "Negative")%>%
  gather(type,value,-c(No,ID,Genotype,Sex,GenotypeSex,Cage))%>%
  separate(type,c("hr","Treatment"),remove=T)%>%
  group_by(ID,hr) %>%
  mutate(OD_difference = value - lag(value)) %>% 
  filter(!is.na(OD_difference)) %>% 
  select(ID, Genotype, Sex, GenotypeSex, hr, OD_difference) %>% 
  group_by(Genotype,hr) %>%
  summarise(value.mean=mean(OD_difference),sd=sd(OD_difference))%>% #Summarize mean and standard deviation for future figure use
  ungroup()%>%
  mutate(hr.new=as.numeric(gsub("hr","",gsub("X","",hr))))%>%
  mutate(min_line=0)
  # select(Genotype,hr,Treatment,value.mean)%>%
  # rename(Genotype=ID)%>%
  # mutate(sd=0)#The negative control is under ID column, to bind negative control and my other data, the ID column should be renamed to Genotype

#bind data 
summary.final2<-rbind(summary.final,neg.ctrl)

# 
# summary.final2<-summary.final2%>%
#   mutate(group=ifelse(group=="G KO","KO with 2'-FL",
#                          ifelse(group=="NG KO","KO without glycan",
#                                 ifelse(group=="G WT", "WT with 2'-FL",
#                                        ifelse(group=="NG WT", "WT without glycan",
#                                               ifelse(group=="G Negative", "Negative control with 2'-FL",
#                                                      "Negative control without glycan")))))) %>%
#   mutate(group=factor(group,levels=c("KO with 2'-FL","KO without glycan",
#                                      "WT with 2'-FL", "WT without glycan",
#                                      "Negative control with 2'-FL", "Negative control without glycan")))# Factor group

  
  

calculus<-summary.final %>%
  filter(Genotype!="Negative") %>%
  group_by(Genotype) %>%
  arrange(Genotype,hr.new)%>% #Sort your data based on ID,Treatment,hr.new
  mutate(sum.value=value.mean+lag(value.mean), #sum.value is the summary of the two base value
         hr.interval=hr.new-lag(hr.new))%>% # hr.interval is the height of the trapezoid
  mutate(area=sum.value*hr.interval/2)%>% #this area is the total area of the highest curve
  # summarise(area.total=sum(area,na.rm = T))%>% # Summarize different timepoint area
  ungroup() %>%
  group_by(Genotype)%>%
  summarise(total.area=sum(area,na.rm=T))%>%
  ungroup()%>%
  mutate(area.difference=total.area-lag(total.area))%>%
  ungroup()%>%
  filter(!is.na(area.difference))%>%
  select(area.difference)#You can't rejoin the dataset because this is summarised and no id is there. 

Summary Line Charts

library(ggplot2)
library(ggthemes)
library (gridExtra)

summary.final2 <- summary.final2 %>% 
  mutate(Genotype=ifelse(Genotype == "WT", "WT",
                         ifelse(Genotype == "KO", "KO", "Media control")))

level_order_genotype <- c("WT","KO", "Media control")
# WT vs KO 
figure <- ggplot(data=subset(summary.final2), aes(x=hr.new,y=value.mean,group=factor(Genotype, level=level_order_genotype)))+
  geom_line(size=1,aes(color=factor(Genotype, level=level_order_genotype))) +
      geom_point(size=2,aes(color=factor(Genotype, level=level_order_genotype),shape=factor(Genotype, level=level_order_genotype))) +
      geom_pointrange(aes(ymin=value.mean-sd, ymax=value.mean+sd,color=factor(Genotype, level=level_order_genotype))) + # error bar method 1 (just line)
      theme_bw() +
      labs(x="Time (hr)",y=expression(OD[600]~ difference),caption = "",
           title = "")+ # Faeces collected from KO mice, you can use "title = paste0(k,"+ Negative")
      theme(legend.title = element_blank(),
            legend.position = "bottom",
            legend.text = element_text(size=12),
            axis.text.x=element_text(colour="black", face="plain", size=13), 
            axis.text.y=element_text(colour="black", face="plain", size=13),
            axis.title.x=element_text(margin = margin(t = 5), size=13),
            axis.title.y=element_text(margin = margin(r = 5), size=13),
            axis.title = element_text(face="plain", size = 11),
            plot.title = element_text(size=10, hjust=0.5),
            panel.grid = element_blank(),
            panel.border =  element_rect(color = "black",
                                    fill = NA,
                                    size = 1),
            aspect.ratio = 1,
            plot.background = element_rect(fill = "transparent", colour = "transparent"))+
      scale_y_continuous(limits = c(-0.01,0.5),breaks = seq(0,0.5,0.1))+
      scale_x_continuous(limits=c(0,120),breaks=seq(0,120,24))+
      scale_colour_manual(values=c("#3C5488B2","#7E6148B2","#3B3B3BB2")) +
      geom_ribbon(aes(ymax=value.mean, ymin=min_line),fill="#cfcfcf",alpha = 0.2)+ # ivory3, seashell3, gray66, #87c0e6, #ffa0aa, #808080
      annotate("text",x=60,y=0.5,
             label=expression(italic("P") == "0.0051"),hjust=0.5, size=4.5)

figure

Calculate area of each sample

neg.ctrl2<-neg.ctrl%>%
  rename(ID=1)%>% # Rename the first column name as ID
  mutate(hr.new = as.numeric(gsub("hr","",gsub("X","",hr)))) # Create a new column with min.line=0

neg.ctrl2 = subset(neg.ctrl2, select=-c(hr, sd)) %>% 
  rename (hr=hr.new) %>% 
  rename (value=value.mean) %>% 
  mutate (Genotype = "Negative") %>% 
  select(ID, hr, Genotype, value)
  

data.individual.final<-df.1 %>%
  select(ID,hr,Genotype, Treatment,value) %>%
  spread(Treatment,value) %>% # Transform the table to normal table (wide)
  mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr))))%>%
  group_by(ID,hr)%>% 
  mutate(value = G-NG) %>% 
  select(ID, hr, Genotype, value) %>% 
  rbind(neg.ctrl2)

# min.line<-summary%>%
#   select(-sd)%>%
#   mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr)))) %>%
#   spread(Genotype,value.mean)%>%
#   group_by(hr)%>%
#   mutate(min_line=min(WT,KO))%>%
#   select(hr,min_line)

calculus<-data.individual.final %>%
  filter(ID!="Negative") %>%
  group_by(ID,Genotype) %>%
  arrange(ID,Genotype,hr)%>% #Sort your data based on ID,Treatment,hr.new
  mutate(sum.value=value+lag(value), #sum.value is the summary of the two base value
         hr.interval=hr-lag(hr))%>% # hr.interval is the height of the trapezoid
  mutate(area=sum.value*hr.interval/2)%>% #this area is the total area of the highest curve
  summarise(area.total=sum(area,na.rm = T))%>% # Summarize different timepoint area
  ungroup() 

Statistical analysis

Normality test

set.seed(123)
calculus_WT <- calculus %>% 
  filter(Genotype == "WT")

calculus_KO <- calculus %>% 
  filter(Genotype == "KO")

# Choosing a normality test
  ## Answer: D'Agostino-Pearson normality test ("omnibus K2" test)
  ## Reasons: It first computes the skewness and kurtosis to quantify how far the distribution is from Gaussian in terms of asymmetry and shape.  It then calculates how far each of these values differs from the value expected with a Gaussian distribution, and computes a single P value from the sum of these discrepancies. It is a versatile and powerful normality test. (https://www.graphpad.com/guides/prism/latest/statistics/stat_choosing_a_normality_test.htm)

# D'Agostino-Pearson normality test ("omnibus K2" test)
  ##Null (H0): The data are normally distributed Alternate/the data are sampled from a Gaussian distribution
  ##The low p-value (<0.05) indicates you reject that null hypothesis and so accept the alternative hypothesis that the data are not sampled from a Gaussian population
  ## https://www.rdocumentation.org/packages/PoweR/versions/1.0.7/topics/statcompute

# Example 
library(PoweR)
getindex() # index 6 is D'Agostino-Pearson normality test ("omnibus K2" test)
##    Index                            Law Nbparams Default1 Default2 Default3
## 1      1                  Laplace(mu,b)        2 0.000000        1       NA
## 2      2               Normal(mu,sigma)        2 0.000000        1       NA
## 3      3               Cauchy(mu,sigma)        2 0.000000        1       NA
## 4      4             Logistic(mu,sigma)        2 0.000000        1       NA
## 5      5              Gamma(shape,rate)        2 2.000000        1       NA
## 6      6                      Beta(a,b)        2 1.000000        1       NA
## 7      7                   Uniform(a,b)        2 0.000000        1       NA
## 8      8                  Student-t(df)        1 1.000000       NA       NA
## 9      9                Chi-squared(df)        1 1.000000       NA       NA
## 10    10       Lognormal(logmean,logsd)        2 0.000000        1       NA
## 11    11           Weibull(shape,scale)        2 1.000000        1       NA
## 12    12             ShiftedExp(l,rate)        2 0.000000        1       NA
## 13    13                        U^{j+1}        1 1.000000       NA       NA
## 14    14                 AveUnif(k,a,b)        3 2.000000        0      1.0
## 15    15                       UUnif(j)        1 1.000000       NA       NA
## 16    16                       VUnif(j)        1 1.000000       NA       NA
## 17    17           JSU(mu,sigma,nu,tau)        4 0.000000        1      0.0
## 18    18                       Tukey(l)        1 1.000000       NA       NA
## 19    19                    LoConN(p,m)        2 0.200000        3       NA
## 20    20                       JSB(g,d)        2 0.000000        1       NA
## 21    21          SkewN(xi,omega,alpha)        3 0.000000        1      0.0
## 22    22                    ScConN(p,d)        2 0.200000        2       NA
## 23    23                GP(mu,sigma,xi)        3 0.000000        1      0.0
## 24    24                GED(mu,sigma,p)        3 0.000000        1      1.0
## 25    25        Stable(alpha,beta,c,mu)        4 1.000000        0      1.0
## 26    26               Gumbel(mu,sigma)        2 1.000000        1       NA
## 27    27        Frechet(mu,sigma,alpha)        3 0.000000        1      1.0
## 28    28               GEV(mu,sigma,xi)        3 0.000000        1      0.0
## 29    29                GArcSine(alpha)        1 0.500000       NA       NA
## 30    30                FoldN(mu,sigma)        2 0.000000        1       NA
## 31    31                    MixN(p,m,d)        3 0.500000        0      1.0
## 32    32                    TruncN(a,b)        2 0.000000        1       NA
## 33    33                        Nout(a)        1 1.000000       NA       NA
## 34    34             GEP(t1,t2,t3,crit)        4 0.500000        0      1.0
## 35    35            Exponential(lambda)        1 1.000000       NA       NA
## 36    36               ALaplace(mu,b,k)        3 0.000000        1      2.0
## 37    37       NIG(alpha,beta,delta,mu)        4 1.000000        0      1.0
## 38    38    APD(theta,phi,alpha,lambda)        4 0.000000        1      0.5
## 39    39 modAPD(mu,sigma,theta1,theta2)        4 0.000000        1      0.5
## 40    40           LPtn(alpha,mu,sigma)        3 1.959964        0      1.0
##    Default4
## 1        NA
## 2        NA
## 3        NA
## 4        NA
## 5        NA
## 6        NA
## 7        NA
## 8        NA
## 9        NA
## 10       NA
## 11       NA
## 12       NA
## 13       NA
## 14       NA
## 15       NA
## 16       NA
## 17    5e-01
## 18       NA
## 19       NA
## 20       NA
## 21       NA
## 22       NA
## 23       NA
## 24       NA
## 25    0e+00
## 26       NA
## 27       NA
## 28       NA
## 29       NA
## 30       NA
## 31       NA
## 32       NA
## 33       NA
## 34    1e-06
## 35       NA
## 36       NA
## 37    0e+00
## 38    2e+00
## 39    2e+00
## 40       NA
##    Index                Stat Alter Nbparams
## 1      1                 K-S     3        0
## 2      2                AD^*     3        0
## 3      3                 Z_C     3        0
## 4      4                 Z_A     3        0
## 5      5                 P_S     3        0
## 6      6                 K^2     3        0
## 7      7                  JB     3        0
## 8      8                  DH     3        0
## 9      9                 RJB     3        0
## 10    10            T_{Lmom}     3        0
## 11    11      T_{Lmom}^{(1)}     3        0
## 12    12      T_{Lmom}^{(2)}     3        0
## 13    13      T_{Lmom}^{(3)}     3        0
## 14    14            BM_{3-4}     3        0
## 15    15            BM_{3-6}     3        0
## 16    16           T_{MC-LR}     3        0
## 17    17                 T_w 0,1,2        0
## 18    18       T_{MC-LR}-T_w     3        0
## 19    19             T_{S,5}     3        0
## 20    20             T_{K,5}     3        0
## 21    21                   W     4        0
## 22    22                  W'     4        0
## 23    23            tilde{W}     4        0
## 24    24                   D 0,1,2        0
## 25    25                   r     4        0
## 26    26                  CS     3        0
## 27    27                   Q 0,1,2        0
## 28    28                Q-Q* 0,1,2        0
## 29    29                BCMR     3        0
## 30    30            beta_3^2     3        0
## 31    31          T^*(alpha)     1        1
## 32    32                 I_n     3        0
## 33    33              R_{sJ}     3        0
## 34    34                  Q* 0,1,2        0
## 35    35                 R_n     3        0
## 36    36             X_{APD}     3        0
## 37    37             Z_{EPD} 0,1,2        0
## 38    38                 GLB     3        0
## 39    39              V_3-ML 0,1,2        0
## 40    40              V_4-ML 0,1,2        0
## 41    41                   S     3        0
## 42    42                 A^2     3        0
## 43    43                 W^2     3        0
## 44    44                 U^2     3        0
## 45    45            sqrt{n}D     3        0
## 46    46                   V     3        0
## 47    47    T_{n,a}^{(1)}-MO     3        1
## 48    48    T_{n,a}^{(1)}-ML     3        1
## 49    49    T_{n,a}^{(2)}-MO     3        1
## 50    50    T_{n,a}^{(2)}-ML     3        1
## 51    51         T_{m,n}^{V}     4        1
## 52    52         T_{m,n}^{E}     4        1
## 53    53         T_{m,n}^{C}     4        1
## 54    54            hat{G}_n     3        0
## 55    55                 V_3 0,1,2        0
## 56    56                 V_4 0,1,2        0
## 57    57                 K_1     3        0
## 58    58                   T     3        0
## 59    59                   Z     3        0
## 60    60                   K     3        0
## 61    61              DLLap1 0,1,2        0
## 62    62              DLLap2 0,1,2        0
## 63    63                 D_n     3        0
## 64    64           W_{n}^{2}     3        0
## 65    65           A_{n}^{2}     3        0
## 66    66                 C_n     3        0
## 67    67                 K_n     3        0
## 68    68                 T_1     3        0
## 69    69                 T_2     3        0
## 70    70                G(n)     3        0
## 71    71                   Q     3        0
## 72    72        2nI^{lambda}     3        1
## 73    73                M(n)     3        0
## 74    74         L_{n}^{(m)}     4        1
## 75    75         S_{n}^{(m)}     3        1
## 76    76              H(m,n)     4        1
## 77    77            A^{*}(n)     3        0
## 78    78 D_{n,m}(phi_lambda)     3        2
## 79    79             E_{m,n}     3        1
## 80    80    T_{n,m}^{lambda}     3        2
## 81    81                 Z_A     3        0
## 82    82                 Z_C     3        0
## 83    83              t test 0,1,2        1
## 84    85              DLLap3     3        0
## 85    86  T(alpha_1,alpha_2)     3        1
## 86    87                 T_n     3        0
## 87    88       T^{LS}(alpha)     3        1
## 88    89  T^{LS}_3(mu,alpha)     3        2
## 89    90  T^{LS}_4(alpha,mu)     3        2
## 90    91                GV_1 0,1,2        0
## 91    92                GV_2 0,1,2        0
## 92    93                Ho_K 0,1,2        0
## 93    94                Ho_U 0,1,2        0
## 94    95                Ho_V 0,1,2        0
## 95    96                Ho_W 0,1,2        0
## 96    97                SR^* 0,1,2        0
statcompute(6,calculus_WT$`area.total`, levels = c(0.05), critvalL = NULL, critvalR = NULL,
            alter = 0, stat.pars = NULL)
## $statistic
## [1] 3.274137
## 
## $pvalue
## [1] 0
## 
## $decision
## [1] 1
## 
## $alter
## [1] 3
## 
## $stat.pars
## [1] NA
## 
## $symbol
## [1] "K^2"
statcompute(6,calculus_KO$`area.total`, levels = c(0.05), critvalL = NULL, critvalR = NULL,
            alter = 0, stat.pars = NULL)
## $statistic
## [1] 10.65753
## 
## $pvalue
## [1] 0
## 
## $decision
## [1] 1
## 
## $alter
## [1] 3
## 
## $stat.pars
## [1] NA
## 
## $symbol
## [1] "K^2"

Unpaired unparametric test: Mann-whitney test

set.seed(123)
# Method 1_Base R
## Summary
Summary.area <- calculus %>% 
  group_by(Genotype) %>% 
  summarise(
    count = n(),
    median= median (area.total, na.rm=TRUE),
    IQR = IQR (area.total,na.rm=TRUE),
    IQR25 = quantile(area.total, 0.25),
    IQR75 = quantile(area.total, 0.75)
  )
Summary.area
## # A tibble: 2 × 6
##   Genotype count median   IQR IQR25 IQR75
##   <chr>    <int>  <dbl> <dbl> <dbl> <dbl>
## 1 KO          32   16.7  6.57  13.1  19.7
## 2 WT          32   20.5  6.91  16.6  23.5
## Area difference between WT and KO
stats <- wilcox.test(area.total ~ Genotype, data=calculus,
                     paired=FALSE, correct=TRUE, conf.int=TRUE, conf.level=0.95, exact=FALSE)
stats
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  area.total by Genotype
## W = 303, p-value = 0.005117
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -6.671942 -1.242029
## sample estimates:
## difference in location 
##              -3.749503
# Effect size
library (rstatix)
set.seed(123)
effectsize <- calculus %>% wilcox_effsize(area.total ~ Genotype)
effectsize
## # A tibble: 1 × 7
##   .y.        group1 group2 effsize    n1    n2 magnitude
## * <chr>      <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 area.total KO     WT       0.351    32    32 moderate
# Method 2_ggpubr_to add it in the figure
## P value
library(ggpubr)
set.seed(123)
compare_means(area.total ~ Genotype, data=calculus,method = "wilcox.test", paired=FALSE, group.by = NULL, ref.group=NULL)
## # A tibble: 1 × 8
##   .y.        group1 group2       p  p.adj p.format p.signif method  
##   <chr>      <chr>  <chr>    <dbl>  <dbl> <chr>    <chr>    <chr>   
## 1 area.total WT     KO     0.00512 0.0051 0.0051   **       Wilcoxon
level_order_genotype <- c("WT","KO")

# Visualization
# install.packages("hrbrthemes")
# library(hrbrthemes)
library(viridis)
figure.area <- ggplot(data=calculus, mapping = aes(y=area.total, x=factor(Genotype,level=level_order_genotype)))+
  geom_boxplot(outlier.shape = NA)+
  theme_bw()+
  geom_jitter(aes(colour=Genotype),
              size=2, alpha=0.9,
              position=position_jitterdodge(jitter.width = 0.5,
                                            jitter.height = 0, 
                                            dodge.width = 0.8)) +
  labs(x="",y=" ",caption = "",
      title = "")+
 theme(legend.position="none",
       axis.text.x=element_text(colour="black", face="plain", size=14), 
       axis.text.y=element_text(colour="black", face="plain", size=14),
       axis.title.x=element_text(margin = margin(t = 5)),
       axis.title.y=element_text(margin = margin(r = 5), size = 16),
       axis.title = element_text(face="plain", size = 10),
        plot.title = element_text(size=10, hjust = 0.5),
        panel.grid = element_blank(),
        panel.border =  element_rect(color = "black",
                                    fill = NA,
                                    size = 1),
        aspect.ratio = 1,
        plot.background = element_rect(fill = "transparent", colour = "transparent"))+
  scale_y_continuous(limits = c(-5,35),breaks = seq(-5,35,10))+
  scale_color_manual(values=c("#7E6148B2","#3C5488B2"))
  

figure.area

# 
#       annotate("text",x=0,y=0.6,
#              label=paste0("Total Area = ",round(as.vector(subset(calculus,Genotype==k,select=area.difference)),2)),
#              hjust=0,size=3)
#   
figure.area.final <- figure.area + 
  annotate("text",x=1.515,y=35,
           label="p = 0.0051", hjust = 0.5, size=4.5)
figure.area.final